Yanfei Kang
yanfeikang@buaa.edu.cn
School of Economics and Management
Beihang University
install.packages(c("stringr", "rvest", "knitr", "jiebaR", "wordcloud2",
"tm", "slam" , "proxy", "topicmodels", "RColorBrewer"))
Text Processing
# read text data into R
# WMTnews.txt can be found on my Github.
wmt.news <- readLines('WMTnews.txt')
# You can also read from the file I put online. It takes roughly 2 mins to read.
# wmt.news <- readLines("https://yanfei.site/docs/dpsa/WMTnews.txt",
# encoding = 'UTF-8')
length(wmt.news)## [1] 449
# print the first news article without quotes
noquote(wmt.news[1])## [1] 沃尔玛在中国强推综合工时制,引发多地门店员工罢工由于不满沃尔玛近期在中国推行的“综合工时制”改革,从7月1日开始,沃尔玛多地门店的基层员工发起罢工。据江西南昌当地媒体的报道,近日南昌沃尔玛八一广场店成了“闹市”,这里的沃尔玛员工正在集体罢工,工装背后贴着的A4纸上写着“沃尔玛员工站起来,抵制综合工时制度,反对欺骗,坚决维权。”据悉,之所以会发生这样的事,是因为沃尔玛要实行新的薪酬制度。员工称,他们本来是与沃尔玛签订了长期劳动合同,现在沃尔玛要求更改合同,本来的月薪制更改成小时制,并强制让员工签字。在大家看来,用工合同的更改,意味着他们的保障得到了根本性的改变。不仅如此,这些员工称,新的合同也变相提高了基本工资,从而规避城市基本工资上调政策,变相降低总体工资。除南昌外,成都、重庆、深圳、哈尔滨的个别商场员工也组织了罢工,以抗议这一次的“综合工时制”改革。由沃尔玛员工自发组织的中国员工联谊会介绍,此前沃尔玛在中国一直以来采用的是标准工时制,全职工每天工作8小时,每周工作5天,每周40小时;但沃尔玛自今年5月开始在中国各地分店推行综合计算工时工作制。新规则下,全职工每天工作4-11小时,每周工作3-6天,每周20-66小时,每月平均标准工时174小时、加班工时不超过36小时。沃尔玛员工认为,新规则可能导致工作时间安排不稳定,而且单方面实施新工时制度在程序上是违法的。知情人士告诉澎湃新闻,沃尔玛在美国实行的就是综合工时制,在中国推行这一改革也是为了与总部统一标准。在中国,综合工时制符合法律要求。《劳动法》共规定三种工时计算标准,即标准工时制、综合工时制和不定时工时制。标准工时制和综合工时制的区别在于,标准工时制以“天”为计算单位,而综合工时制以“周、月、年”为计算单位。也就是说,标准工时制是按照社会上最常见的8小时工作制,每周不超过40小时的标准来计算员工的工作量。而综合工时制是在每周不超过40小时的工作总量下,灵活分配每天的工作时间,工作长度。只是,从标准工时制改为综合工时制,还需要得到中国各地相关劳动主管部门的批准,而各地的要求也不尽相同。有的地方主管部门直接批准即可执行;有的地方主管部门则要求获得绝大部分员工的同意才可推行新政。因此,沃尔玛一些城市门店的基层员工需要签字同意这一改革。在这一过程中,部分员工对新政有各种各样的担忧,罢工事件由此爆发,沃尔玛在中国的“综合工时制”改革遭遇强大的挑战。据江西当地电视台7月3日报道,南昌市总工会的领导已经要求沃尔玛华中地区负责人向沃尔玛中国总部反映,恢复标准工时制。截止澎湃新闻发稿之时,尚未获得沃尔玛中国总部针对这一事件的回应。业内分析,沃尔玛在中国推行“综合工时制”改革的目的还是为了降低人力成本。沃尔玛这两年来一直在“做减法”,减去他们认为不利于管控、不利于标准化、不利于规模化、不利于降低成本的任何环节、商品、配置等。这种做法的好处在于,沃尔玛进一步加强中央管控,门店更加“听话”,并且可以节省成本,在利润上有直接体现。但这种做法也存在门店的本地化、个性化日益下降,商品竞争力逐渐下滑的弊端。(来源:澎湃新闻)进入【新浪财经股吧】讨论
# write text data from R to a file
cat(wmt.news, file = "WMTnews.txt", sep = "\n")read.table(), write.table() etc.nchar()stringr::str_length()# number of characters in each news article
nchar(wmt.news)## [1] 1308 1005 1066 2886 440 270 2313 452 3099 683 3119 3140 397 2781
## [15] 419 460 2839 2519 2934 572 1181 156 1723 3301 2245 2401 2872 2849
## [29] 1226 2048 2324 3439 3182 1055 1698 1881 673 2877 1719 254 2342 770
## [43] 1325 771 1923 497 3145 139 2096 323 251 1652 495 301 1227 419
## [57] 1720 3219 736 3245 2602 722 1345 748 524 1537 924 62 54 632
## [71] 3143 3120 966 1790 962 940 2636 1497 964 250 218 338 1677 346
## [85] 366 1679 841 2011 866 1314 393 664 1708 1320 1977 1592 285 833
## [99] 731 336 1882 3241 2270 1251 1455 215 2264 345 344 955 782 267
## [113] 411 0 683 158 285 1155 168 382 1759 5446 292 651 3927 578
## [127] 607 144 142 540 867 1136 1874 654 539 141 42 1033 229 140
## [141] 1246 892 1442 863 2293 289 2583 142 564 773 694 121 122 1958
## [155] 1251 1284 928 2195 493 1334 2537 1724 1229 728 1552 548 3479 763
## [169] 1907 0 789 997 626 797 855 560 882 1166 48 552 52 729
## [183] 248 701 624 632 751 356 542 346 1054 2617 237 3197 582 827
## [197] 1366 491 1016 538 956 3938 963 1188 0 2352 1176 1247 2533 1249
## [211] 2039 1426 463 456 1042 128 551 1589 296 468 3950 198 855 870
## [225] 254 660 207 599 362 430 442 285 843 735 5897 308 149 300
## [239] 598 592 399 835 2299 866 840 211 379 397 1415 456 945 439
## [253] 1485 995 242 2253 239 399 879 1630 690 825 740 201 300 71
## [267] 52 1246 653 998 554 1623 1134 1138 1066 360 302 709 828 159
## [281] 598 170 420 432 3448 513 292 3305 136 1883 2184 794 534 782
## [295] 1919 1527 1562 1638 811 0 931 576 1168 1218 2130 798 291 465
## [309] 720 4068 563 1806 90 25 1014 72 1496 468 0 1571 1769 2775
## [323] 972 3515 1898 181 1263 376 92 1903 138 0 410 2128 465 575
## [337] 1740 583 856 879 1214 398 1084 1114 2146 341 2159 480 1952 469
## [351] 1127 113 95 2922 132 872 5488 205 137 145 706 469 470 1095
## [365] 231 1093 1068 950 650 205 887 380 25 1297 1642 821 3251 830
## [379] 2028 0 522 146 347 560 598 662 400 218 1965 1627 1551 235
## [393] 182 777 534 1048 7061 414 167 2986 1214 425 1107 1138 1229 1186
## [407] 1216 0 727 1758 773 2073 588 864 299 415 0 3256 741 3691
## [421] 403 1979 2050 0 0 661 1458 1231 1445 814 799 2477 1920 1493
## [435] 166 554 114 711 1456 2669 0 1183 1123 209 1527 115 297 1425
## [449] 576
# library(stringr); str_length(wmt.news)paste()stringr::str_c()# concatenate characters
paste('2015', '06-04', sep = '-')## [1] "2015-06-04"
paste('2015', c('06-04', '06-05'), sep = '-')## [1] "2015-06-04" "2015-06-05"
paste('2015', c('06-04', '06-05'), sep = '-', collapse = ' ')## [1] "2015-06-04 2015-06-05"
# str_c() in stringr
library(stringr)
str_c('2015', '06-04', '00:00', sep = '-')## [1] "2015-06-04-00:00"
# frequently used in web scraping
paste('http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=',
'阿里巴巴', sep = '')## [1] "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=阿里巴巴"
paste('http://sou.zhaopin.com/jobs/searchresult.ashx?jl=',
'上海', '&kw=', '阿里巴巴', sep = '')## [1] "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=上海&kw=阿里巴巴"
sprintf() is a superior choice over paste.# combine text and variable values
comp <- '阿里巴巴'
job.location <- '上海'
sprintf('http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%s&kw=%s',
job.location, comp)## [1] "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=上海&kw=阿里巴巴"
strsplit()stringr::str_split()# split characters
dates <- c('2015-06-04', '2015-06-05')
strsplit(dates, "-")## [[1]]
## [1] "2015" "06" "04"
##
## [[2]]
## [1] "2015" "06" "05"
strsplit('2015-06-04', '-')## [[1]]
## [1] "2015" "06" "04"
# another way
library(stringr)
str_split(dates, '-')## [[1]]
## [1] "2015" "06" "04"
##
## [[2]]
## [1] "2015" "06" "05"
str_split('2015-06-04', '-')## [[1]]
## [1] "2015" "06" "04"
# search for matches
mySentences <- c('沃尔玛还与微信跨界合作,顾客可通过沃尔玛微信服务号的付款功能在实体门店秒付买单。',
'沃尔玛移动支付应用已经部署在其全美4,600家超市中。')
grep('沃尔玛', mySentences)## [1] 1 2
grepl('沃尔玛', mySentences)## [1] TRUE TRUE
library(stringr); str_detect(mySentences, '沃尔玛')## [1] TRUE TRUE
regexpr('沃尔玛', mySentences)## [1] 1 1
## attr(,"match.length")
## [1] 3 3
gregexpr('沃尔玛', mySentences)## [[1]]
## [1] 1 18
## attr(,"match.length")
## [1] 3 3
##
## [[2]]
## [1] 1
## attr(,"match.length")
## [1] 3
# replace white spaces
messySentences <- c('沃尔玛还与微信 跨界合作,顾客可通过沃尔玛微信服务号的付 款功能在实体门店秒付买单。',
'沃尔玛移动支付应 用已经部 署在其全美4,600家超市中。')
# pattern replacement
# sub(pattern, replacement, x, ...)
sub(' ', '', messySentences)## [1] "沃尔玛还与微信 跨界合作,顾客可通过沃尔玛微信服务号的付 款功能在实体门店秒付买单。"
## [2] "沃尔玛移动支付应用已经部 署在其全美4,600家超市中。"
# gsub(pattern, replacement, x, ...)
gsub(' ', '', messySentences)## [1] "沃尔玛还与微信跨界合作,顾客可通过沃尔玛微信服务号的付款功能在实体门店秒付买单。"
## [2] "沃尔玛移动支付应用已经部署在其全美4,600家超市中。"
# extract substrings: substr(x, start, stop)
x <- c('月薪:5000元', '月薪:8000元')
substr(x,4,7)## [1] "5000" "8000"
Load text from the https://yanfei.site/docs/dpsa/BABAnews.txt and print it on screen. Text file contains some of the news of Alibaba.
How many paragraphs are there in the article?
Trim leading whitespaces of each paragraph (try ??trim).
How many characters are there in the article?
Collapse paragraphs into one and display it on the screen (un-list it).
Does the text contain word ‘技术架构’?
Split the article into sentences (by periods).
Replace ‘双11’ with ‘双十一’.
# 1: read the Alibaba news article from the URL and print it without quotes
baba.news <- readLines("https://yanfei.site/docs/dpsa/BABAnews.txt",
encoding = 'UTF-8')
noquote(baba.news)
# 2: readLines() yields one element per line, so the vector length is
#    the number of paragraphs in the article
length(baba.news)
# 3: trim leading (and trailing) whitespace from every paragraph
baba.news <- str_trim(baba.news)
# 4: number of characters per paragraph (sum these for the whole article)
nchar(baba.news)
# 5: collapse all paragraphs into one string and display it
baba.news.collapsed <- paste(baba.news, collapse = "")
cat(baba.news.collapsed)
# 6: does the text contain the word '技术架构'?
#    grepl() tests the collapsed article; grep() reports which
#    paragraphs contain the word
grepl("技术架构", baba.news.collapsed)
grep('技术架构', baba.news)
# 7: split the article into sentences at the Chinese full stop
strsplit(baba.news.collapsed, "。")
# 8: replace every occurrence of '双11' with '双十一'
gsub("双11", "双十一", baba.news.collapsed)Please see Text processing on Wiki for more details, examples, R packages and R functions used for text processing in R.
Web Scraping
| Movie | Score | Length (mins) | Language |
|---|---|---|---|
| 爱乐之城 | 8.4 | 128 | English |
| 看不见的客人 | 8.7 | 106 | Spanish |
| … | … | … | … |
When we do web scraping, we deal with html tags to find the path of the information we want to extract.
A simple html source code: tree structure of html tags. HTML tags normally come in pairs.
<!DOCTYPE html>
<html>
<title> My title
</title>
<body>
<h1> My first heading </h1>
<p> My first paragraph </p>
</body>
</html>
<!DOCTYPE html> : HTML documents must start with a type declaration<html> and </html><body> and </body><h1> to <h6> tags<p> tagHTML links are defined with the <a> tag
<a href="http://www.test.com">This is a link for test.com</a>HTML tables are defined with <table>, row as <tr> and rows are divided into data as <td>
<table style="width:100%">
<tr>
<td> 中文名称 </td>
<td> 英文名称 </td>
<td> 简称 </td>
</tr>
<tr>
<td> 北京航空航天大学 </td>
<td> Beihang University </td>
<td> 北航 </td>
</tr>
</table>HTML list starts with <ul> (unordered) and <ol> (ordered). Each item of list starts with <li>
<ol>
<li> 科技获奖 </li>
<li> 服务国家战略 </li>
<li> 标志性成果 </li>
</ol>You can try http://www.tryiteditor.com to learn more about html.
<!DOCTYPE html>
<html>
<title> My title
</title>
<body>
<h1> My first heading </h1>
<p> My first paragraph </p>
</body>
</html>
/html/title: selects the <title> element of an HTML document//p: selects all the <p> elements<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images', class='img'>
<a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg'/></a>
<a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg'/></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg'/></a>
<a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg'/></a>
<a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg'/></a>
</div>
<div>
<a href='img.html'> text <img src='img.jpg'/></a>
</div>
</body>
</html>
//div[@id="images"]: selects all the <div> elements which contain an attribute id="images". Note its difference with //div
//div[@class="img"]//body/div[1]//div[@id="images"]/a/: selects all the <a> elements inside the aforementioned element.<td class="zwmc" style="width: 250px;">
<div style="width: 224px;*width: 218px; _width:200px; float: left">
<a style="font-weight: bold">金融分析师</a>
</div>
</td>
<a> element from the source above.//td[@class="zwmc"]/div/a//td[@class="zwmc"]//aScrape job information from http://sou.zhaopin.com of jobs related to ‘阿里巴巴’.
//td[@class="zwmc"]/div/a.Can you find xpath for companies, salaries, locations and links?
read_html().html_nodes(). It pull out the entire node.html_table(): extract all data inside a html table.html_text(): extract all text within the node.html_attr(): extract contents of a single attribute.html_attrs(): extract all attributes.library(rvest)
web <- read_html('<!DOCTYPE html>
<html>
<title> My title
</title>
<body>
<h1> My first heading </h1>
<p> My first paragraph </p>
</body>
</html>')
title_node <- html_nodes(web, xpath = '//title')
title_node## {xml_nodeset (1)}
## [1] <title> My title\n </title>
html_text(title_node)## [1] " My title\n "
str_trim(html_text(title_node))## [1] "My title"
url <- "https://en.wikipedia.org/wiki/Provinces_of_China"
web <- read_html(url)
provinces_nodes <-
html_nodes(web, xpath = '//*[@class="wikitable sortable"]')
provinces <- html_table(provinces_nodes)
library(knitr)
kable(head(provinces[[1]]), format = "html")| GB[2] | ISO[3] | Province | Chinese Hanyu Pinyin | Capital | Population1 | Density2 | Area3 | Abbreviation4 |
| BJ | CN-11 | Beijing Municipality | 北京市Běijīng Shì | Beijing | 19,612,368 | 1,167.40 | 16,800 | 京Jīng |
| TJ | CN-12 | Tianjin Municipality | 天津市Tiānjīn Shì | Tianjin | 12,938,224 | 1,144.46 | 11,305 | 津Jīn |
| HE | CN-13 | Hebei Province | 河北省Héběi Shěng | Shijiazhuang | 71,854,202 | 382.81 | 187,700 | 冀Jì |
| SX | CN-14 | Shanxi Province | 山西省Shānxī Shěng | Taiyuan | 35,712,111 | 228.48 | 156,300 | 晋Jìn |
| NM | CN-15 | Inner Mongolia Autonomous Region | 內蒙古自治区Nèi Měnggǔ Zìzhìqū | Hohhot | 24,706,321 | 20.88 | 1,183,000 | 內蒙古(蒙)Nèi Měnggǔ (Měng) |
url <- "https://en.wikipedia.org/wiki/Provinces_of_China"
web <- read_html(url)
references_nodes <-
html_nodes(web, xpath = '//div[@class="reflist"]//li')
references <- html_text(references_nodes)
references## [1] "^ Administrative divisions of China"
## [2] "^ GB/T 2260 codes for the provinces of China"
## [3] "^ ISO 3166-2:CN (ISO 3166-2 codes for the provinces of China)"
library(rvest)
url <- 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=阿里巴巴'
web <- read_html(url)
job_title_nodes <- html_nodes(web, xpath = '//td[@class="zwmc"]/div/a')
length(job_title_nodes)## [1] 60
job_title <- html_text(job_title_nodes)
job_title[1:5]## [1] "阿里妈妈-高级渠道商业专家-北京" "优酷-社区运营-粉丝运营-大运营中心"
## [3] "优酷-互动运营-大运营中心" "优酷-优酷社区明星运营-用户运营"
## [5] "优酷-直客销售经理(本土)"
link <- html_attr(job_title_nodes, 'href')
link[1:5]## [1] "http://jobs.zhaopin.com/000127917285307.htm"
## [2] "http://jobs.zhaopin.com/000127917285305.htm"
## [3] "http://jobs.zhaopin.com/000127917285303.htm"
## [4] "http://jobs.zhaopin.com/000127917285301.htm"
## [5] "http://jobs.zhaopin.com/00012791790284766000.htm"
job_title_nodes <- html_nodes(web, xpath = '//td[@class="zwmc"]/div/a')
job_title <- html_text(job_title_nodes)
\(\Downarrow\)
job_title <- web %>%
html_nodes(xpath = '//td[@class="zwmc"]/div/a') %>%
html_text()
library(rvest)
# Search-result page: jobs in Beijing matching the keyword '阿里巴巴'.
url <- 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=阿里巴巴'
web <- read_html(url, encoding = "utf-8")
# Query the job-title nodes ONCE and reuse the node set for both the
# visible text and the href attribute (the original parsed the same
# xpath twice, doing the node search work a second time for no gain).
title_nodes <- web %>%
  html_nodes(xpath = '//td[@class="zwmc"]/div/a')
job_title <- html_text(title_nodes)
link <- html_attr(title_nodes, 'href')
# link = paste('[Link](', link, sep='')
# link <- paste(link, ')', sep='')
# Company name, salary and work location come from sibling table cells.
company <- web %>%
  html_nodes(xpath = '//td[@class="gsmc"]') %>%
  html_text()
salary <- web %>%
  html_nodes(xpath = '//td[@class="zwyx"]') %>%
  html_text()
location <- web %>%
  html_nodes(xpath = '//td[@class="gzdd"]') %>%
  html_text()
# Assemble one row per job posting.
alibaba_jobs <- data.frame(job_title, company, salary, location, link)
library(knitr)
kable(head(alibaba_jobs), format = "html")| job_title | company | salary | location | link |
|---|---|---|---|---|
| 阿里妈妈-高级渠道商业专家-北京 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285307.htm |
| 优酷-社区运营-粉丝运营-大运营中心 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285305.htm |
| 优酷-互动运营-大运营中心 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285303.htm |
| 优酷-优酷社区明星运营-用户运营 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285301.htm |
| 优酷-直客销售经理(本土) | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/00012791790284766000.htm |
| 阿里文学-资深内容运营经理 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285288.htm |
Think about it: how would you scrape the remaining result pages (i.e. turn pages programmatically)?
library(stringr)
get_job_detail <- function(link){
  # Scrape one zhaopin.com job-posting page and return its details as a
  # one-row data frame with columns: experience, degree, number
  # (openings) and description.
  #
  # Args:
  #   link: URL of the job detail page (factor or character; coerced).
  #
  # Returns:
  #   data.frame(experience, degree, number, description)
  link <- as.character(link)
  web <- read_html(link)
  # Local helper: extract the text of the node(s) matching an xpath.
  pull_text <- function(xpath) {
    web %>%
      html_nodes(xpath = xpath) %>%
      html_text()
  }
  # The detail list <ul class="terminal-ul clearfix"> stores the fields
  # positionally: 5th <li> = experience, 6th = degree, 7th = headcount.
  experience <- pull_text('//ul[@class="terminal-ul clearfix"]/li[5]/strong')
  degree <- pull_text('//ul[@class="terminal-ul clearfix"]/li[6]/strong')
  number <- pull_text('//ul[@class="terminal-ul clearfix"]/li[7]/strong')
  description <- pull_text('//div[@class="terminalpage-main clearfix"]/div/div[1]')
  # Strip page boilerplate from the free-text description
  # (map link, "work address" label, city name), then trim whitespace.
  description <- sub('查看职位地图', '', description)
  description <- sub('工作地址:', '', description)
  description <- sub('北京', '', description)
  description <- str_trim(description)
  data.frame(experience, degree, number, description)
}
# Fetch the detail page of every posting. Build all rows first with
# lapply() and bind them ONCE: the original rbind-inside-a-loop copied
# the growing data frame on every iteration (quadratic work).
job_details <- do.call(rbind,
                       lapply(alibaba_jobs$link, get_job_detail))
# Attach the scraped details to the listing table, one row per job.
alibaba_job_details <- cbind(alibaba_jobs, job_details)
kable(head(subset(alibaba_job_details, select = -description)), format = "html")| job_title | company | salary | location | link | experience | degree | number |
|---|---|---|---|---|---|---|---|
| 阿里妈妈-高级渠道商业专家-北京 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285307.htm | 3-5年 | 本科 | 若干 |
| 优酷-社区运营-粉丝运营-大运营中心 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285305.htm | 3-5年 | 本科 | 若干 |
| 优酷-互动运营-大运营中心 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285303.htm | 3-5年 | 本科 | 若干 |
| 优酷-优酷社区明星运营-用户运营 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285301.htm | 3-5年 | 本科 | 若干 |
| 优酷-直客销售经理(本土) | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/00012791790284766000.htm | 3-5年 | 本科 | 若干 |
| 阿里文学-资深内容运营经理 | 阿里巴巴集团 | 面议 | 北京 | http://jobs.zhaopin.com/000127917285288.htm | 5-10年 | 本科 | 若干 |
Please choose one from the following exercises.
Extract names, research interests, emails and links of all BUAA SEM Professors (http://sem.buaa.edu.cn/szdw/jsbd.htm)
Extract at least 5 attributes of the movies listed on Douban top 250 (https://movie.douban.com/top250)
Extract the top 5 pages of hotel information including the newest reviews from TripAdvisor (https://www.tripadvisor.com/Hotels-g294212-Beijing-Hotels.html)
Extract the top 5 pages of book information from Amazon (https://www.amazon.cn/s/ref=nb_sb_noss?__mk_zh_CN=亚马逊网站&field-keywords=大数据)
When you scrape a website too frequently, the server may reject your request. One possible solution is to stop for several seconds irregularly.
Not every website is scrapable! Some websites use really advanced technology to protect their data from being extracted. For example, they use JavaScript or really complex captcha codes.
Python has more functionality for web scraping. It is more flexible to deal with the problems mentioned above. If you are interested in that, please refer to this book. Basics of web scraping with Python are similar.
Text Mining
# convert the scraped job descriptions to plain character strings
alibaba_job_description <- as.character(alibaba_job_details$description)# load the word segment package
library(jiebaR)
# build a segment engine
engine1 <- worker(stop_word = 'stopwords.txt')
# add news words into the engine
new_user_word(engine1, c("新技术", '新媒体'))## [1] TRUE
# for each job description, perform word segmentation
# Segment every job description with the jiebaR engine and flatten the
# results into one character vector of words. (Replaces the original
# grow-by-c() loop over 1:length(...), which copied the vector on every
# iteration; unlist(lapply(...)) builds it in one pass.)
Words <- unlist(lapply(alibaba_job_description,
                       function(description) segment(description, engine1)))
# we need to consider other stopwords in this specific case
myStopwords <- c('工作', '地址', '公司', '岗位', '描述',
'负责', '职位', '年', '优先', '具备',
'熟悉','相关')
# Drop the extra stopwords with logical negation. The original used
# Words[-which(Words %in% myStopwords)], which silently returns an
# EMPTY vector whenever there are no matches (-integer(0) selects
# nothing); the logical form below is safe in that edge case.
Words <- Words[!(Words %in% myStopwords)]
# remove all the digits from each term
Words <- gsub("[0-9]+?",'', Words)
# only keep terms longer than one character
Words <- Words[nchar(Words) > 1]
head(Words)## [1] "互联网" "主流" "媒体" "生意" "规则" "利用"
# Count how often each term occurs, order the counts from most to least
# frequent, and keep the 150 most common terms for the word cloud.
wordsNum <- sort(table(unlist(Words)), decreasing = TRUE)
words.top150 <- head(wordsNum, 150)
library(RColorBrewer); colors <- brewer.pal(8,"Dark2")
# Sys.setlocale("LC_CTYPE")
library(wordcloud2)
wordcloud2(words.top150, color = "random-dark", shape = 'circle', backgroundColor = 'white')# letterCloud(words.top150, word= "A", color = "random-light",
# backgroundColor = "white",size = 0.3)toolsNum <- wordsNum[substr(names(wordsNum),1,1)%in% c("A","B","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z")]
tools.top50 <- head(toolsNum, 50)
my.english.stopwords <- c('IN', 'OF', 'TO', 'AND', 'WITH', 'FOR',
'THE', 'WORKING', 'WORK')
tools.top <- tools.top50[!(names(tools.top50) %in% my.english.stopwords)]
wordcloud2(tools.top, color = "random-dark", shape = 'circle', backgroundColor = 'white')# load the text mining packages
# Load the text-mining packages: tm for corpora/document-term matrices,
# slam for sparse-matrix row sums.
library(tm)
library(slam)
# Read the Walmart news corpus (one article per line).
wmt.news <- readLines("WMTnews.txt")
# wmt.news <- readLines("https://yanfei.site/docs/dpsa/WMTnews.txt", encoding = 'UTF-8')
# build word segment engine with a custom stopword list
mixseg <- worker(stop_word = "stopwords.txt")
# mixseg <- worker(stop_word = "https://yanfei.site/docs/dpsa/stopwords.txt")
# bylines = TRUE: segment each input line (article) separately, so the
# result is a list with one word vector per article
mixseg$bylines <- TRUE
# word segmentation for each of the 449 articles
word_list <- mixseg[wmt.news]
# Clean one segmented article: strip all digits, restore the store name
# '1号店' (whose leading digit was just removed by the digit filter),
# then keep only terms longer than one character and join them into a
# single space-separated string.
f <- function(x){
  terms <- gsub("[0-9]+?", '', x)
  terms[terms == '号店'] <- '1号店'
  paste(terms[nchar(terms) > 1], collapse = ' ')
}
# cleanup: apply f to each article (drop digits / one-character terms,
# collapse into one space-separated string), then build a tm corpus
d.vec <- lapply(word_list,f)
corpus <- Corpus(VectorSource(d.vec))
# remove stopwords: corpus-specific terms on top of the general list
myStopwords <- c('新浪', '沃尔玛', '年', '月', '日','公司', '中国', '有限公司')
stopwords <- readLines('stopwords.txt')
mycorpus <- tm_map(corpus,removeWords,c(stopwords, myStopwords))
# create the DocumentTermMatrix: strip punctuation, keep only terms of
# at least 2 characters, and apply both stopword lists again
control <- list(removePunctuation=T,
wordLengths = c(2, Inf),
stopwords = c(stopwords, myStopwords))
d.dtm <- DocumentTermMatrix(mycorpus, control)
# drop empty documents (rows whose terms were all filtered out)
d.dtm <- d.dtm[row_sums(d.dtm)!=0, ]
# remove sparse terms (drop terms absent from more than 99% of documents)
d.dtm.sub <- removeSparseTerms(d.dtm, sparse = 0.99)
# text clustering: hierarchical clustering of the articles
library(proxy)
# pairwise cosine distance between the document term-count vectors
d.dist <- proxy::dist(as.matrix(d.dtm.sub), method='cosine')
# agglomerate with Ward's linkage, then cut the tree into 2 clusters
fit <- hclust(d.dist, method="ward.D")
memb <- cutree(fit, k = 2)
plot(fit)findFreqTerms(d.dtm.sub[memb==1, ], 300)## [1] "商品" "美国" "门店" "市场" "服务" "记者" "亿美元"
## [8] "会员" "增长" "消费者" "超市" "企业" "全球" "销售"
## [15] "零售" "食品" "业务" "山姆" "电商"
findFreqTerms(d.dtm.sub[memb==2, ], 300)## [1] "合作" "业务" "京东" "电商" "1号店"
library(topicmodels)
ctm <- topicmodels::CTM(d.dtm.sub, k = 2)
terms(ctm, 2, 0.01)## Topic 1 Topic 2
## [1,] "门店" "京东"
## [2,] "食品" "1号店"
XML, RCurl and scrapeR are also used for web scraping.